<!DOCTYPE html>
<html lang="en-us">
  <head>

    <meta charset="utf-8">
<title>Tidying Up Input Text | Elasticsearch: The Definitive Guide [2.x] | Elastic</title>
<link rel="home" href="index.html" title="Elasticsearch: The Definitive Guide [2.x]">
<link rel="up" href="identifying-words.html" title="Identifying Words">
<link rel="prev" href="icu-tokenizer.html" title="icu_tokenizer">
<link rel="next" href="token-normalization.html" title="Normalizing Tokens">
<meta name="DC.type" content="Learn/Docs/Legacy/Elasticsearch/Definitive Guide/2.x">
<meta name="DC.subject" content="Elasticsearch">
<meta name="DC.identifier" content="2.x">
<meta name="robots" content="noindex,nofollow">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <script src="https://cdn.optimizely.com/js/18132920325.js"></script>
    <link rel="apple-touch-icon" sizes="57x57" href="/apple-icon-57x57.png">
    <link rel="apple-touch-icon" sizes="60x60" href="/apple-icon-60x60.png">
    <link rel="apple-touch-icon" sizes="72x72" href="/apple-icon-72x72.png">
    <link rel="apple-touch-icon" sizes="76x76" href="/apple-icon-76x76.png">
    <link rel="apple-touch-icon" sizes="114x114" href="/apple-icon-114x114.png">
    <link rel="apple-touch-icon" sizes="120x120" href="/apple-icon-120x120.png">
    <link rel="apple-touch-icon" sizes="144x144" href="/apple-icon-144x144.png">
    <link rel="apple-touch-icon" sizes="152x152" href="/apple-icon-152x152.png">
    <link rel="apple-touch-icon" sizes="180x180" href="/apple-icon-180x180.png">
    <link rel="icon" type="image/png" href="/favicon-32x32.png" sizes="32x32">
    <link rel="icon" type="image/png" href="/android-chrome-192x192.png" sizes="192x192">
    <link rel="icon" type="image/png" href="/favicon-96x96.png" sizes="96x96">
    <link rel="icon" type="image/png" href="/favicon-16x16.png" sizes="16x16">
    <link rel="manifest" href="/manifest.json">
    <meta name="apple-mobile-web-app-title" content="Elastic">
    <meta name="application-name" content="Elastic">
    <meta name="msapplication-TileColor" content="#ffffff">
    <meta name="msapplication-TileImage" content="/mstile-144x144.png">
    <meta name="theme-color" content="#ffffff">
    <meta name="naver-site-verification" content="936882c1853b701b3cef3721758d80535413dbfd">
    <meta name="yandex-verification" content="d8a47e95d0972434">
    <meta name="localized" content="true">
    <meta name="st:robots" content="follow,index">
    <meta property="og:image" content="https://www.elastic.co/static/images/elastic-logo-200.png">
    <link rel="shortcut icon" href="/favicon.ico" type="image/x-icon">
    <link rel="icon" href="/favicon.ico" type="image/x-icon">
    <link rel="apple-touch-icon-precomposed" sizes="64x64" href="/favicon_64x64_16bit.png">
    <link rel="apple-touch-icon-precomposed" sizes="32x32" href="/favicon_32x32.png">
    <link rel="apple-touch-icon-precomposed" sizes="16x16" href="/favicon_16x16.png">
    <!-- Give IE8 a fighting chance -->
    <!--[if lt IE 9]>
    <script src="https://oss.maxcdn.com/html5shiv/3.7.2/html5shiv.min.js"></script>
    <script src="https://oss.maxcdn.com/respond/1.4.2/respond.min.js"></script>
    <![endif]-->
    <link rel="stylesheet" type="text/css" href="/guide/static/styles.css">
  </head>

  <!--© 2015-2021 Elasticsearch B.V. Copying, publishing and/or distributing without written permission is strictly prohibited.-->

  <body>
    <!-- Google Tag Manager -->
    <!-- Creates the global dataLayer queue, then asynchronously injects the GTM
         container script for GTM-58RLH5 before the first <script> in the document.
         The <noscript> iframe is the fallback when JavaScript is disabled. -->
    <script>dataLayer = [];</script>
    <noscript><iframe src="https://www.googletagmanager.com/ns.html?id=GTM-58RLH5" title="Google Tag Manager" height="0" width="0" style="display:none;visibility:hidden"></iframe></noscript>
    <script>
      (function(w,d,s,l,i){
        w[l]=w[l]||[];
        // Record the load start time as the first event in the queue.
        w[l].push({'gtm.start': new Date().getTime(),event:'gtm.js'});
        var f=d.getElementsByTagName(s)[0],
            j=d.createElement(s),
            // Only append the &l= parameter when a non-default queue name is used.
            dl=l!='dataLayer'?'&l='+l:'';
        j.async=true;
        // Explicit https: rather than a protocol-relative URL, so the container
        // is never requested over plain HTTP.
        j.src='https://www.googletagmanager.com/gtm.js?id='+i+dl;
        f.parentNode.insertBefore(j,f);
      })(window,document,'script','dataLayer','GTM-58RLH5');
    </script>
    <!-- End Google Tag Manager -->

    <!-- Global site tag (gtag.js) - Google Analytics -->
    <script async src="https://www.googletagmanager.com/gtag/js?id=UA-12395217-16"></script>
    <script>
      // Reuse the dataLayer queue if one already exists on window; otherwise start an empty one.
      window.dataLayer = window.dataLayer || [];
      // Queues each call by pushing the raw `arguments` object (not a copied array) onto dataLayer.
      function gtag(){dataLayer.push(arguments);}
      // Queue the 'js' command with the current time, then the 'config' command for this property ID.
      gtag('js', new Date());
      gtag('config', 'UA-12395217-16');
    </script>

    <!--BEGIN QUALTRICS WEBSITE FEEDBACK SNIPPET-->
    <script type="text/javascript">
      // Vendor loader for the Qualtrics site-intercept script. Constructor arguments:
      //   e = sampling rate (compared against 100), h = mode flag ("v" or "r"),
      //   f = name of the cookie holding the sampling state,
      //   g = URL of the script to inject (this parameter shadows the outer `g` inside the constructor).
      (function(){var g=function(e,h,f,g){
      // get(a): returns the value of cookie `a` read from document.cookie, or null when it is not set.
      this.get=function(a){for(var a=a+"=",c=document.cookie.split(";"),b=0,e=c.length;b<e;b++){for(var d=c[b];" "==d.charAt(0);)d=d.substring(1,d.length);if(0==d.indexOf(a))return d.substring(a.length,d.length)}return null};
      // set(a,c): writes cookie a=c with path=/ and an expiry 6048E5 ms (7 days) from now.
      this.set=function(a,c){var b="",b=new Date;b.setTime(b.getTime()+6048E5);b="; expires="+b.toGMTString();document.cookie=a+"="+c+b+"; path=/; "};
      // check(): returns true when the intercept script should be loaded.
      // State lives in cookie f as "mode:rate:counter". With no cookie and e == 100 it returns true
      // without writing a cookie. A stored rate of 100 also returns true. Otherwise mode "v" returns
      // false, and mode "r" returns true only when the stored counter is a multiple of
      // floor(100/rate), incrementing and re-saving the counter on every call.
      this.check=function(){var a=this.get(f);if(a)a=a.split(":");else if(100!=e)"v"==h&&(e=Math.random()>=e/100?0:100),a=[h,e,0],this.set(f,a.join(":"));else return!0;var c=a[1];if(100==c)return!0;switch(a[0]){case "v":return!1;case "r":return c=a[2]%Math.floor(100/c),a[2]++,this.set(f,a.join(":")),!c}return!0};
      // go(): when check() passes, appends a <script> element with src = g to document.body (if body exists).
      this.go=function(){if(this.check()){var a=document.createElement("script");a.type="text/javascript";a.src=g;document.body&&document.body.appendChild(a)}};
      // start(): defers go() to the window "load" event, using attachEvent when addEventListener is unavailable.
      this.start=function(){var a=this;window.addEventListener?window.addEventListener("load",function(){a.go()},!1):window.attachEvent&&window.attachEvent("onload",function(){a.go()})}};
      // Run the loader with rate 100 and mode "r"; exceptions thrown by construction or start() are silently ignored.
      try{(new g(100,"r","QSI_S_ZN_emkP0oSe9Qrn7kF","https://znemkp0ose9qrn7kf-elastic.siteintercept.qualtrics.com/WRSiteInterceptEngine/?Q_ZID=ZN_emkP0oSe9Qrn7kF")).start()}catch(i){}})();
    </script><div id="ZN_emkP0oSe9Qrn7kF"><!--DO NOT REMOVE-CONTENTS PLACED HERE--></div>
    <!--END WEBSITE FEEDBACK SNIPPET-->

    <div id="elastic-nav" style="display:none;"></div>
    <script src="https://www.elastic.co/elastic-nav.js"></script>

    <!-- Subnav -->
    <div>
      <div>
        <div class="tertiary-nav d-none d-md-block">
          <div class="container">
            <div class="p-t-b-15 d-flex justify-content-between nav-container">
              <div class="breadcrum-wrapper"><span><a href="/guide/" style="font-size: 14px; font-weight: 600; color: #000;">Docs</a></span></div>
            </div>
          </div>
        </div>
      </div>
    </div>

    <div class="main-container">
      <section id="content">
        <div class="content-wrapper">

          <section id="guide" lang="en">
            <div class="container">
              <div class="row">
                <div class="col-xs-12 col-sm-8 col-md-8 guide-section">
                  <!-- start body -->
                  <div class="page_header">
<p>
  <strong>WARNING</strong>: The 2.x versions of Elasticsearch have passed their
  <a href="https://www.elastic.co/support/eol">EOL dates</a>. If you are running
  a 2.x version, we strongly advise you to upgrade.
</p>
<p>
  This documentation is no longer maintained and may be removed. For the latest
  information, see the <a href="https://www.elastic.co/guide/en/elasticsearch/reference/current/index.html">current
  Elasticsearch documentation</a>.
</p>
</div>
<div id="content">
<div class="breadcrumbs">
<span class="breadcrumb-link"><a href="index.html">Elasticsearch: The Definitive Guide [2.x]</a></span>
»
<span class="breadcrumb-link"><a href="languages.html">Dealing with Human Language</a></span>
»
<span class="breadcrumb-link"><a href="identifying-words.html">Identifying Words</a></span>
»
<span class="breadcrumb-node">Tidying Up Input Text</span>
</div>
<div class="navheader">
<span class="prev">
<a href="icu-tokenizer.html">« icu_tokenizer</a>
</span>
<span class="next">
<a href="token-normalization.html">Normalizing Tokens »</a>
</span>
</div>
<div class="section">
<div class="titlepage"><div><div>
<h2 class="title">
<a id="char-filters"></a>Tidying Up Input Text<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch-definitive-guide/edit/2.x/210_Identifying_words/50_Tidying_text.asciidoc">edit</a>
</h2>
</div></div></div>
<p>Tokenizers produce the best results when the input text is clean, valid
text, where <em>valid</em> means that it follows the punctuation rules that the
Unicode algorithm expects.  Quite often, though, the text we need to process
is anything but clean. Cleaning it up before tokenization improves the quality
of the output.</p>
<div class="section">
<div class="titlepage"><div><div>
<h3 class="title">
<a id="_tokenizing_html"></a>Tokenizing HTML<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch-definitive-guide/edit/2.x/210_Identifying_words/50_Tidying_text.asciidoc">edit</a>
</h3>
</div></div></div>
<p>Passing HTML through the <code class="literal">standard</code> tokenizer or the <code class="literal">icu_tokenizer</code> produces
poor results.  These tokenizers just don’t know what to do with the HTML tags.
For example:</p>
<div class="pre_wrapper lang-js">
<pre class="programlisting prettyprint lang-js">GET /_analyze?tokenizer=standard
&lt;p&gt;Some d&amp;eacute;j&amp;agrave; vu &lt;a href="http://somedomain.com&gt;"&gt;website&lt;/a&gt;</pre>
</div>
<p>The <code class="literal">standard</code> tokenizer confuses HTML tags and entities, and emits the
following tokens: <code class="literal">p</code>, <code class="literal">Some</code>, <code class="literal">d</code>, <code class="literal">eacute</code>, <code class="literal">j</code>, <code class="literal">agrave</code>, <code class="literal">vu</code>, <code class="literal">a</code>,
<code class="literal">href</code>, <code class="literal">http</code>, <code class="literal">somedomain.com</code>, <code class="literal">website</code>, <code class="literal">a</code>.  Clearly not what was
intended!</p>
<p><em>Character filters</em> can be added to an analyzer to preprocess the text
<em>before</em> it is passed to the tokenizer.  In this case, we can use the
<code class="literal">html_strip</code> character filter to remove HTML tags and to decode HTML entities
such as <code class="literal">&amp;eacute;</code> into the corresponding Unicode characters.</p>
<p>Character filters can be tested out via the <code class="literal">analyze</code> API by specifying them
in the query string:</p>
<div class="pre_wrapper lang-js">
<pre class="programlisting prettyprint lang-js">GET /_analyze?tokenizer=standard&amp;char_filters=html_strip
&lt;p&gt;Some d&amp;eacute;j&amp;agrave; vu &lt;a href="http://somedomain.com&gt;"&gt;website&lt;/a&gt;</pre>
</div>
<p>To use them as part of the analyzer, they should be added to a <code class="literal">custom</code>
analyzer definition:</p>
<div class="pre_wrapper lang-js">
<pre class="programlisting prettyprint lang-js">PUT /my_index
{
    "settings": {
        "analysis": {
            "analyzer": {
                "my_html_analyzer": {
                    "tokenizer":     "standard",
                    "char_filter": [ "html_strip" ]
                }
            }
        }
    }
}</pre>
</div>
<p>Once created, our new <code class="literal">my_html_analyzer</code> can be tested with the <code class="literal">analyze</code> API:</p>
<div class="pre_wrapper lang-js">
<pre class="programlisting prettyprint lang-js">GET /my_index/_analyze?analyzer=my_html_analyzer
&lt;p&gt;Some d&amp;eacute;j&amp;agrave; vu &lt;a href="http://somedomain.com&gt;"&gt;website&lt;/a&gt;</pre>
</div>
<p>This emits the tokens that we expect: <code class="literal">Some</code>, <code class="literal">déjà</code>, <code class="literal">vu</code>, <code class="literal">website</code>.</p>
</div>

<div class="section">
<div class="titlepage"><div><div>
<h3 class="title">
<a id="_tidying_up_punctuation"></a>Tidying Up Punctuation<a class="edit_me edit_me_private" rel="nofollow" title="Editing on GitHub is available to Elastic" href="https://github.com/elastic/elasticsearch-definitive-guide/edit/2.x/210_Identifying_words/50_Tidying_text.asciidoc">edit</a>
</h3>
</div></div></div>
<p>The <code class="literal">standard</code> tokenizer and <code class="literal">icu_tokenizer</code> both understand that an
apostrophe <em>within</em> a word should be treated as part of the word, while single
quotes that <em>surround</em> a word should not. Tokenizing the text <code class="literal">You're my 'favorite'</code> would correctly emit the tokens <code class="literal">You're, my, favorite</code>.</p>
<p>Unfortunately, Unicode lists a few characters that are sometimes used
as apostrophes:</p>
<div class="variablelist">
<dl class="variablelist">
<dt>
<span class="term">
<code class="literal">U+0027</code>
</span>
</dt>
<dd>
Apostrophe (<code class="literal">'</code>)—the original ASCII character
</dd>
<dt>
<span class="term">
<code class="literal">U+2018</code>
</span>
</dt>
<dd>
Left single-quotation mark (<code class="literal">‘</code>)—opening quote when single-quoting
</dd>
<dt>
<span class="term">
<code class="literal">U+2019</code>
</span>
</dt>
<dd>
Right single-quotation mark (<code class="literal">’</code>)—closing quote when single-quoting, but also the preferred character to use as an apostrophe
</dd>
</dl>
</div>
<p>Both tokenizers treat these three characters as an apostrophe (and thus as
part of the word) when they appear within a word. Then there are another three
apostrophe-like characters:</p>
<div class="variablelist">
<dl class="variablelist">
<dt>
<span class="term">
<code class="literal">U+201B</code>
</span>
</dt>
<dd>
Single high-reversed-9 quotation mark (<code class="literal">‛</code>)—same as <code class="literal">U+2018</code> but differs in appearance
</dd>
<dt>
<span class="term">
<code class="literal">U+0091</code>
</span>
</dt>
<dd>
Windows-1252 left single-quotation mark (byte <code class="literal">0x91</code>) mis-decoded as ISO-8859-1—should not be used in Unicode
</dd>
<dt>
<span class="term">
<code class="literal">U+0092</code>
</span>
</dt>
<dd>
Windows-1252 right single-quotation mark (byte <code class="literal">0x92</code>) mis-decoded as ISO-8859-1—should not be used in Unicode
</dd>
</dl>
</div>
<p>Both tokenizers treat these three characters as word boundaries—​a place to
break text into tokens. Unfortunately, some publishers use <code class="literal">U+201B</code> as a
stylized way to write names like <code class="literal">M‛Coy</code>, and the second two characters may well
be produced by your word processor, depending on its age.</p>
<p>Even when using the “acceptable” quotation marks, a word written with a
single right quotation mark—<code class="literal">You’re</code>—is not the same as the word written
with an apostrophe—<code class="literal">You're</code>—which means that a query for one variant
will not find the other.</p>
<p>Fortunately, it is possible to sort out this mess with the <code class="literal">mapping</code> character
filter, which allows us to replace all instances of one character with
another.  In this case, we will replace all apostrophe variants with the
simple <code class="literal">U+0027</code> apostrophe:</p>
<div class="pre_wrapper lang-js">
<pre class="programlisting prettyprint lang-js">PUT /my_index
{
  "settings": {
    "analysis": {
      "char_filter": { <a id="CO137-1"></a><i class="conum" data-value="1"></i>
        "quotes": {
          "type": "mapping",
          "mappings": [ <a id="CO137-2"></a><i class="conum" data-value="2"></i>
            "\\u0091=&gt;\\u0027",
            "\\u0092=&gt;\\u0027",
            "\\u2018=&gt;\\u0027",
            "\\u2019=&gt;\\u0027",
            "\\u201B=&gt;\\u0027"
          ]
        }
      },
      "analyzer": {
        "quotes_analyzer": {
          "tokenizer":     "standard",
          "char_filter": [ "quotes" ] <a id="CO137-3"></a><i class="conum" data-value="3"></i>
        }
      }
    }
  }
}</pre>
</div>
<div class="calloutlist">
<table border="0" summary="Callout list">
<tr>
<td align="left" valign="top" width="5%">
<p><a href="#CO137-1"><i class="conum" data-value="1"></i></a></p>
</td>
<td align="left" valign="top">
<p>We define a custom <code class="literal">char_filter</code> called <code class="literal">quotes</code> that
maps all apostrophe variants to a simple apostrophe.</p>
</td>
</tr>
<tr>
<td align="left" valign="top" width="5%">
<p><a href="#CO137-2"><i class="conum" data-value="2"></i></a></p>
</td>
<td align="left" valign="top">
<p>For clarity, we have used the JSON Unicode escape syntax
for each character, but we could just have used the
characters themselves: <code class="literal">"‘=&gt;'"</code>.</p>
</td>
</tr>
<tr>
<td align="left" valign="top" width="5%">
<p><a href="#CO137-3"><i class="conum" data-value="3"></i></a></p>
</td>
<td align="left" valign="top">
<p>We use our custom <code class="literal">quotes</code> character filter to create
a new analyzer called <code class="literal">quotes_analyzer</code>.</p>
</td>
</tr>
</table>
</div>
<p>As always, we test the analyzer after creating it:</p>
<div class="pre_wrapper lang-js">
<pre class="programlisting prettyprint lang-js">GET /my_index/_analyze?analyzer=quotes_analyzer
You’re my ‘favorite’ M‛Coy</pre>
</div>
<p>This example returns the following tokens, with all of the in-word
quotation marks replaced by apostrophes: <code class="literal">You're</code>, <code class="literal">my</code>, <code class="literal">favorite</code>, <code class="literal">M'Coy</code>.</p>
<p>The more effort that you put into ensuring that the tokenizer receives good-quality input, the better your search results will be.</p>
</div>

</div>
<div class="navfooter">
<span class="prev">
<a href="icu-tokenizer.html">« icu_tokenizer</a>
</span>
<span class="next">
<a href="token-normalization.html">Normalizing Tokens »</a>
</span>
</div>
</div>

                  <!-- end body -->
                </div>
                <div class="col-xs-12 col-sm-4 col-md-4" id="right_col">
                  <div id="rtpcontainer" style="display: block;">
                    <div class="mktg-promo">
                      <h3>Most Popular</h3>
                      <ul class="icons">
                        <li class="icon-elasticsearch-white"><a href="https://www.elastic.co/webinars/getting-started-elasticsearch?baymax=default&amp;elektra=docs&amp;storm=top-video">Get Started with Elasticsearch: Video</a></li>
                        <li class="icon-kibana-white"><a href="https://www.elastic.co/webinars/getting-started-kibana?baymax=default&amp;elektra=docs&amp;storm=top-video">Intro to Kibana: Video</a></li>
                        <li class="icon-logstash-white"><a href="https://www.elastic.co/webinars/introduction-elk-stack?baymax=default&amp;elektra=docs&amp;storm=top-video">ELK for Logs &amp; Metrics: Video</a></li>
                      </ul>
                    </div>
                  </div>
                </div>
              </div>
            </div>
          </section>

        </div>


<div id="elastic-footer"></div>
<script src="https://www.elastic.co/elastic-footer.js"></script>
<!-- Footer Section end-->

      </section>
    </div>

<script src="/guide/static/jquery.js"></script>
<script type="text/javascript" src="/guide/static/docs.js"></script>
<script type="text/javascript">
  // Defines window.initial_state as an empty object.
  // NOTE(review): presumably read by docs.js; the consumer is not visible in this file, so confirm.
  window.initial_state = {}</script>
  </body>
</html>
